import pandas as pd

# Build a small, class-balanced subset of a large labelled review CSV.
#
# The input is streamed in chunks so the full file never has to fit in memory.
# Assumption carried over from the original author: the `label` column marks
# sentiment (1 = positive, 0 = negative) and the text column is named
# "review" (this script itself only reads `label`).

# Input CSV to sample from.
file_path = "./dataset/all_data.csv"

# Rows read from the CSV per chunk.
chunk_size = 100000
# Stop reading once BOTH classes have more than this many candidate rows.
pool_threshold = 100000
# Desired number of rows per class in the output.
sample_per_class = 50000

# Per-class accumulators of filtered chunks, plus running row counts so the
# totals do not have to be recomputed from scratch on every iteration.
positive_rows = []
negative_rows = []
total_pos = 0
total_neg = 0

# The context manager guarantees the underlying file handle is closed even
# when the loop exits early via `break`.
with pd.read_csv(file_path, chunksize=chunk_size) as reader:
    for chunk in reader:
        pos_chunk = chunk[chunk['label'] == 1]
        neg_chunk = chunk[chunk['label'] == 0]

        positive_rows.append(pos_chunk)
        negative_rows.append(neg_chunk)

        total_pos += len(pos_chunk)
        total_neg += len(neg_chunk)

        # Stop early once enough candidates of both classes were collected.
        if total_pos > pool_threshold and total_neg > pool_threshold:
            break

# The file may run out before either class reaches the target size; cap the
# per-class sample at the smaller class so the output stays balanced instead
# of letting DataFrame.sample raise on an oversized `n`.
n_per_class = min(sample_per_class, total_pos, total_neg)
if n_per_class == 0:
    raise ValueError(
        f"cannot build a balanced sample from {file_path}: "
        f"found {total_pos} positive and {total_neg} negative rows"
    )
if n_per_class < sample_per_class:
    print(f"警告：样本不足（正面 {total_pos} 条，负面 {total_neg} 条），每类仅采样 {n_per_class} 条")

# Merge the collected chunks and draw a reproducible sample from each class.
positive_df = pd.concat(positive_rows).sample(n=n_per_class, random_state=42)
negative_df = pd.concat(negative_rows).sample(n=n_per_class, random_state=42)

# Combine both classes and shuffle so the labels are interleaved.
small_df = pd.concat([positive_df, negative_df])
small_df = small_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Write the reduced dataset to a new file.
small_df.to_csv("small_data.csv", index=False)

# NOTE: the ~100MB figure in the message is the original author's estimate;
# the script does not measure the output size.
print("完成：small_data.csv 约为 100MB，正负评论各占一半")
